library(tidyverse)
set.seed(5702)
WeeklySavings <- tibble(A = rexp(n = 52, rate = .015)) %>%
mutate(B = rnorm(n = 52, mean = median(A) - 2,
sd = 50),
C = ifelse(test = B > median(B) + 10,
yes = B * 10, no = B),
D = ifelse(test = B < median(B) - 10,
yes = B - 100, no = B))
WeeklySavings <- WeeklySavings %>%
dplyr::mutate_all(sort)
DT::datatable(WeeklySavings, options = list(paging=F)) %>% DT::formatCurrency(1:4)tidySavings <- gather(WeeklySavings, person, amount)
ggplot(tidySavings, aes(amount)) + geom_histogram(fill = "blue") + facet_wrap(~person)fivenumnames <- c("min", "lower-hinge", "median", "upper-hinge", "max")
D <- WeeklySavings$D
fivenum(D) %>% set_names(fivenumnames)## min lower-hinge median upper-hinge max
## -195.2 -100.5 33.9 57.7 163.1
boxplot(D, horizontal = TRUE, ylim=c(-250, 200))
text(fivenum(D)[c(1,3,5)], 1.25, round(fivenum(D)[c(1,3,5)],1), col = "red")
text(fivenum(D)[c(2,4)], .75, round(fivenum(D),1)[c(2,4)], col = "red")## min lower-hinge median upper-hinge max
## -95.249 -0.473 33.889 577.408 1631.089
boxplot(C, horizontal = TRUE)
text(median(C), 1.25, round(median(C),1), col = "red")
text(fivenum(C)[c(2,4)], .75, round(fivenum(C),1)[c(2,4)], col = "red")b <- boxplot(df$`Weekend Gross`,
horizontal = TRUE, axes = F,
ylim = c(0, 35), lty = "solid",
lwd = 2, border = "blue")
mtext("Weekend Box Office Gross, Top 20",
side = 3, line = -1, cex = 1.5,
font = 2)
mtext("Jan 4-6, 2019",
side = 3, line = -3, cex = 1.2)
mtext("in millions of $", side = 1,
line = .5, cex = 1.2)
axis(1, 0:35, labels = NA, line = -2)
axis(1, seq(0, 35, 5), lwd = 2, line = -2)b <- boxplot(df$`Weekend Gross`,
horizontal = TRUE, axes = F,
ylim = c(0, 35), lty = "solid",
lwd = 2, border = "blue")
mtext("Weekend Box Office Gross, Top 20",
side = 3, line = -1, cex = 1.5,
font = 2)
mtext("Jan 4-6, 2019",
side = 3, line = -3, cex = 1.2)
mtext("in millions of $", side = 1,
line = .5, cex = 1.2)
axis(1, 0:35, labels = NA, line = -2)
axis(1, seq(0, 35, 5), lwd = 2, line = -2)
outliers <- which(df$`Weekend Gross` %in% b$out)
text(b$out, 1.1, df$Title[outliers], col = "red")“H-spread” or fourth spread (upper hinge - lower hinge)
fences:
1.5 x hinge spread above upper-hinge
1.5 x hinge spread below lower-hinge
b <- boxplot(df$`Weekend Gross`,
horizontal = TRUE, axes = F,
ylim = c(-10, 20), lty = "solid",
lwd = 2, border = "blue")
f <- fivenum(df$`Weekend Gross`)
fences <- c(f[2],f[4]) + c(-1,1)*1.5*(f[4]-f[2])
abline(v = fences, col = "green", lwd = 2)fences:
1.5 x hinge spread above upper-hinge
1.5 x hinge spread below lower-hinge
b <- boxplot(df$`Weekend Gross`, axes = F,
ylim = c(-30, 45), lty = "solid",
lwd = 2, border = "blue")
innerfences <- c(f[2],f[4]) + c(-1,1)*1.5*(f[4]-f[2])
outerfences <- c(f[2],f[4]) + c(-1,1)*3*(f[4]-f[2])
abline(h = innerfences, col = "green", lwd = 2)
text(1, innerfences[1]+1.5, "inner fence (1.5 times hinge-spread from hinge)", col = "green")
abline(h = outerfences, col = "darkgreen", lwd = 2)
text(1, outerfences[1]+1.5, "outer fence (3 times hinge-spread from hinge)", col = "darkgreen")
diff <- outerfences[2] - innerfences[2]
text(1, diff/2 + innerfences[2]+2, "outside values", col = "blue")
text(1, diff/2 + outerfences[2], "far out values", col = "blue")## [1] 0.703 0.923 1.005 1.168 1.609 1.808 1.843 1.903 2.147 2.368
## [11] 3.303 4.674 4.755 5.735 9.110 13.127 13.203 15.861 18.238 31.003
## min lower-hinge median upper-hinge max
## 0.703 1.709 2.835 11.118 31.003
## 0% 25% 50% 75% 100%
## 0.703 1.758 2.835 10.114 31.003
See: ?quantile for different methods
Sometimes boxplots are drawn using the IQR (interquartile range) instead of hinge spread
## $stats
## [1] 0.703 1.709 2.835 11.118 18.238
##
## $n
## [1] 20
##
## $conf
## [1] -0.489 6.160
##
## $out
## [1] 31
| ymin | lower | middle | upper | ymax | outliers | notchupper | notchlower | x | PANEL | group | ymin_final | ymax_final | xmin | xmax | xid | newx | new_width | weight | colour | fill | size | alpha | shape | linetype |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.703 | 1.76 | 2.83 | 10.1 | 18.2 | 31.00328 | 5.79 | -0.117 | 1 | 1 | -1 | 0.703 | 31 | 0.625 | 1.38 | 1 | 1 | 0.75 | 1 | grey20 | white | 0.5 | NA | 19 | solid |
library(ggplot2)
world <- read.csv("countries2012.csv")
ggplot(world, aes(x = CONTINENT, y = TFR)) + geom_boxplot()| COUNTRY | CONTINENT | TFR |
|---|---|---|
| Afghanistan | Asia | 5.27 |
| Timor-Leste | Asia | 5.30 |
| COUNTRY | CONTINENT | TFR |
|---|---|---|
| Australia | Oceania | 1.92 |
library(dplyr)
tfrorderdesc <- world %>% group_by(CONTINENT) %>%
summarize(count = n(), median = median(TFR)) %>% arrange(-median)
g2 + annotate("text", x=1:6, y = 8,
label = tfrorderdesc$count, color = "blue",
size = 6) +
ggtitle("count:") + theme_grey(14) +
theme(plot.title = element_text(color = "blue"))gb <- ggplot(world, aes(x = reorder(CONTINENT, TFR, median),
y = TFR)) +
geom_boxplot(varwidth = TRUE) +
coord_flip() + theme_grey(14)
gblibrary(likert)
data("pisaitems")
p <- pisaitems[1:100, 2:7] %>%
dplyr::mutate_all(as.integer) %>%
dplyr::filter(complete.cases(.))
boxplot(p, las = 1, main = "PISA data (scale: 1 - 4)",
border = "blue")Source: R likert::pisaitems dataset
gh <- ggplot(world, aes(x = TFR, y = ..density..)) +
geom_histogram(color = "blue", fill = "lightblue") +
facet_wrap(~reorder(CONTINENT, -TFR, median), nrow = 6, strip.position = "top") +
theme(strip.placement = "outside",
strip.background = element_blank(),
strip.text = element_text(face = "bold"))
ghggplot(world, aes(x = GDP)) +
geom_histogram(fill = "white", color = "lightblue") +
geom_freqpoly() + theme_grey(14)g <- ggplot(world, aes(x = GDP, y = ..density..)) +
geom_histogram(fill = "white", color = "lightblue")
gg + geom_density(color = "red", bw = 2000) +
geom_density(color = "blue", bw = 4000) +
geom_density(color = "green", bw = 16000) +
geom_density(color = "purple", bw = 1000)ggvis)library(ggvis)
world %>% ggvis(~GDP) %>%
layer_densities(adjust = input_slider(.1, 5,
label = "bandwidth"))See also: http://ggvis.rstudio.com/0.1/quick-examples.html#histograms
library(dplyr)
america <- world %>% filter(grepl("America", CONTINENT))
ggplot(america, aes(x = GDP, color = CONTINENT, fill = CONTINENT)) +
geom_density(alpha = .2)library(dplyr)
america <- world %>% filter(grepl("America", CONTINENT))
ggplot(america, aes(x = GDP, color = CONTINENT, fill = CONTINENT)) +
geom_density(alpha = .2, adjust = .5)ggplot(world, aes(x = CONTINENT,
y = GDP)) +
geom_violin() +
ggtitle("Gross Domestic Product") +
coord_flip() + theme_grey(14)ggplot(world, aes(x = CONTINENT,
y = GDP)) +
geom_violin(adjust = 6) +
ggtitle("Gross Domestic Product") +
coord_flip() + theme_grey(14)ggplot(world, aes(x = reorder(CONTINENT, GDP, median),
y = GDP)) +
geom_violin() +
ggtitle("Gross Domestic Product") +
coord_flip() + theme_grey(14)library(gridExtra)
g1 <- ggplot(world, aes(x = factor(1), y = TFR)) +
geom_boxplot() + xlab("") + coord_flip()
g2 <- ggplot(world, aes(x = factor(1), y = TFR)) +
geom_violin() + xlab("") + coord_flip()
grid.arrange(g1, g2, ncol = 1)Source: https://eagereyes.org/blog/2017/joy-plots
Additional resources:
Jocelyn Bell discovers first radio pulsars, 1967
library(ggridges)
gr <- ggplot(world, aes(x = GDP, y = reorder(CONTINENT, -GDP,
median))) +
geom_density_ridges(fill = "blue", alpha = .5)
grggplot(world, aes(x = GDP,
y = reorder(CONTINENT, -GDP, median))) +
geom_density_ridges(fill = "blue", alpha = .5, scale = 1)ggridge packageCRAN https://CRAN.R-project.org/package=ggridges
Github https://github.com/clauswilke/ggridges
Package vignette(s) https://cran.r-project.org/web/packages/ggridges/vignettes/introduction.html
https://cran.r-project.org/web/packages/ggridges/vignettes/gallery.html
Package manual https://cran.r-project.org/web/packages/ggridges/ggridges.pdf